Bigfoot

Author

Aaron Guerra

if (!require("rspat")) remotes::install_github("rspatial/rspat")
## Loading required package: rspat
## Loading required package: terra
## terra 1.8.97
if (!require("predicts")) install.packages("predicts")
## Loading required package: predicts
if (!require("geodata")) install.packages("geodata")

library(terra)
library(rspat)
bf <- spat_data("bigfoot")
dim(bf)
[1] 3092    3
head(bf)
        lon      lat Class
1 -142.9000 61.50000     A
2 -132.7982 55.18720     A
3 -132.8202 55.20350     A
4 -141.5667 62.93750     A
5 -149.7853 61.05950     A
6 -141.3165 62.77335     A
plot(bf[,1:2], cex=0.5, col="red")
library(geodata)
wrld <- geodata::world(path=".")
bnds <- wrld[wrld$NAME_0 %in% c("Canada", "Mexico", "United States"), ]
lines(bnds)

wc <- geodata::worldclim_global("bio", res=10, ".")
plot(wc[[c(1, 12)]], nr=2)

bfc <- extract(wc, bf[,1:2])
head(bfc, 3)
  ID wc2.1_10m_bio_1 wc2.1_10m_bio_2 wc2.1_10m_bio_3 wc2.1_10m_bio_4
1  1       -1.832979       12.504708        28.95899       1152.4308
2  2        6.360650        5.865935        32.27475        462.5731
3  3        6.360650        5.865935        32.27475        462.5731
  wc2.1_10m_bio_5 wc2.1_10m_bio_6 wc2.1_10m_bio_7 wc2.1_10m_bio_8
1        20.34075      -22.840000        43.18075        5.327750
2        16.65505       -1.519947        18.17500        3.964495
3        16.65505       -1.519947        18.17500        3.964495
  wc2.1_10m_bio_9 wc2.1_10m_bio_10 wc2.1_10m_bio_11 wc2.1_10m_bio_12
1      -0.6887083         11.80792       -16.038542              991
2      10.4428196         12.28183         1.467686             3079
3      10.4428196         12.28183         1.467686             3079
  wc2.1_10m_bio_13 wc2.1_10m_bio_14 wc2.1_10m_bio_15 wc2.1_10m_bio_16
1              120               42         31.32536              337
2              448              141         35.27518             1127
3              448              141         35.27518             1127
  wc2.1_10m_bio_17 wc2.1_10m_bio_18 wc2.1_10m_bio_19
1              157              288              216
2              468              630              873
3              468              630              873
bfc <- bfc[,-1]
plot(bfc[ ,"wc2.1_10m_bio_1"], bfc[, "wc2.1_10m_bio_12"], col="red",
        xlab="Annual mean temperature (°C)", ylab="Annual precipitation (mm)")

ext_bf <- ext(vect(bf[, 1:2])) + 1
ext_bf
SpatExtent : -157.75, -63.4627, 24.141, 70.5 (xmin, xmax, ymin, ymax)
set.seed(0)
window(wc) <- ext_bf
bg <- spatSample(wc, 5000, "random", na.rm=TRUE, xy=TRUE)
head(bg)
          x        y wc2.1_10m_bio_1 wc2.1_10m_bio_2 wc2.1_10m_bio_3
1  -99.2500 66.75000     -13.2934895        7.870646        14.96619
2 -106.0833 42.08333       5.6722708       14.530958        36.82943
3 -111.9167 46.58333       6.7605939       14.135854        35.23372
4 -106.9167 54.75000       0.4086979       11.528605        24.43290
5 -118.2500 67.08333      -9.1363859        8.185354        16.34505
6 -111.2500 38.91667       8.4194584       15.997125        38.84047
  wc2.1_10m_bio_4 wc2.1_10m_bio_5 wc2.1_10m_bio_6 wc2.1_10m_bio_7
1       1638.6833        15.42850       -37.16100        52.58950
2        894.3715        27.86600       -11.58875        39.45475
3        927.7927        28.14375       -11.97650        40.12025
4       1290.1088        22.55225       -24.63250        47.18475
5       1567.0846        17.46575       -32.61275        50.07850
6        904.0610        30.49050       -10.69625        41.18675
  wc2.1_10m_bio_8 wc2.1_10m_bio_9 wc2.1_10m_bio_10 wc2.1_10m_bio_11
1        6.484917      -31.617332         7.518209        -31.76942
2        9.226916       -4.839750        17.168291         -4.83975
3       15.638333       -4.921750        18.186209         -4.92175
4       15.417084      -13.864500        15.417084        -16.31392
5        8.609292      -21.353209        10.573625        -27.49783
6       19.076958        3.179209        19.812834         -2.50475
  wc2.1_10m_bio_12 wc2.1_10m_bio_13 wc2.1_10m_bio_14 wc2.1_10m_bio_15
1              171               33                4         70.29919
2              288               42               13         38.78144
3              293               48                9         53.40759
4              471               86               16         58.32499
5              223               43                7         61.21693
6              228               28               11         32.40370
  wc2.1_10m_bio_16 wc2.1_10m_bio_17 wc2.1_10m_bio_18 wc2.1_10m_bio_19
1               90               13               78               13
2              112               41               90               41
3              129               35              115               35
4              220               53              220               56
5              108               27               93               29
6               83               40               72               44
plot(bg[, c("x", "y")])

bg <- bg[, -c(1:2)]
plot(bg[,1], bg[,12], xlab="Annual mean temperature (°C)",
          ylab="Annual precipitation (mm)", cex=.8)
points(bfc[,1], bfc[,12], col="red", cex=.6, pch="+")
legend("topleft", c("observed", "background"), col=c("red", "black"), pch=c("+", "o"), pt.cex=c(.6, .8))

#eastern points
bfe <- bfc[bf[,1] > -102, ]
#western points
bfw <- bfc[bf[,1] <= -102, ]
dw <- rbind(cbind(pa=1, bfw), cbind(pa=0, bg))
de <- rbind(cbind(pa=1, bfe), cbind(pa=0, bg))
dw <- data.frame(dw)
de <- data.frame(na.omit(de))

library(rpart)
cart <- rpart(pa~., data=dw)
printcp(cart)

Regression tree:
rpart(formula = pa ~ ., data = dw)

Variables actually used in tree construction:
[1] wc2.1_10m_bio_10 wc2.1_10m_bio_12 wc2.1_10m_bio_14 wc2.1_10m_bio_18
[5] wc2.1_10m_bio_19 wc2.1_10m_bio_3  wc2.1_10m_bio_4 

Root node error: 983.29/6224 = 0.15798

n= 6224 

        CP nsplit rel error  xerror     xstd
1 0.322797      0   1.00000 1.00019 0.019351
2 0.080521      1   0.67720 0.68771 0.019687
3 0.073325      2   0.59668 0.59759 0.015480
4 0.068645      3   0.52336 0.52163 0.015231
5 0.027920      4   0.45471 0.47026 0.014758
6 0.014907      5   0.42679 0.44778 0.015136
7 0.010869      6   0.41188 0.44082 0.015406
8 0.010197      7   0.40102 0.43500 0.015330
9 0.010000      8   0.39082 0.43197 0.015319
plotcp(cart)

cart <- rpart(pa~., data=dw, cp=0.02)
library(rpart.plot)
rpart.plot(cart, uniform=TRUE, main="Regression Tree")

Question 1: What are the environmental conditions that Bigfoot appears to enjoy most?

The conditions that bigfoot likes most are lots of precipitation during the coldest quarter but little precipitation during the driest month.

x <- predict(wc, cart)
x <- mask(x, wc[[1]])
x <- round(x, 2)
plot(x, type="class", plg=list(x="bottomleft"))

set.seed(123)
i <- sample(nrow(dw), 0.2 * nrow(dw))
test <- dw[i,]
train <- dw[-i,]

fpa <- as.factor(train[, 'pa'])

library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
crf <- randomForest(train[, 2:ncol(train)], fpa)
crf

Call:
 randomForest(x = train[, 2:ncol(train)], y = fpa) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 4

        OOB estimate of  error rate: 7.19%
Confusion matrix:
     0   1 class.error
0 3832 165  0.04128096
1  193 790  0.19633774
varImpPlot(crf)

trf <- tuneRF(train[, 2:ncol(train)], train[, "pa"])
mtry = 6  OOB error = 0.05610547 
Searching left ...
mtry = 3    OOB error = 0.05687885 
-0.0137843 0.05 
Searching right ...
mtry = 12   OOB error = 0.05726308 
-0.02063272 0.05 

mt <- trf[which.min(trf[,2]), 1]
mt
[1] 6

Question 2: What did tuneRF help us find? What does the values of mt represent?

tuneRF allows us to find the optival value for mtry, which is the number of variables randomly sampled to use as predictors at each split.

The variable mt is the number of variables to use at each split that minimizes out-of-bag-error. Interestingly, when I run it I get 6 instead of 12 as in the textbook.

rrf <- randomForest(train[, 2:ncol(train)], train[, "pa"], mtry=mt, ntree=250)
rrf

Call:
 randomForest(x = train[, 2:ncol(train)], y = train[, "pa"], ntree = 250,      mtry = mt) 
               Type of random forest: regression
                     Number of trees: 250
No. of variables tried at each split: 6

          Mean of squared residuals: 0.05436819
                    % Var explained: 65.68
plot(rrf)

Question 3: What does plot(rrf) show us?

The plot shows us how out of bag error rates decrease as the number of trees are increased. In this case, after about 50 trees the error rates stabilizes, so there is no need to go beyond that.

rp <- predict(wc, rrf, na.rm=TRUE)
plot(rp)

library(predicts)
eva <- pa_evaluate(predict(rrf, test[test$pa==1, ]), predict(rrf, test[test$pa==0, ]))
eva
@stats
   np   na prevalence   auc   cor pcor   ODP
1 241 1003      0.194 0.967 0.802    0 0.806

@thresholds
  max_kappa max_spec_sens no_omission equal_prevalence equal_sens_spec
1     0.443         0.346       0.018            0.194           0.213

@tr_stats
    treshold kappa  CCR  TPR  TNR  FPR  FNR  PPP  NPP  MCR  OR
1          0     0 0.19    1    0    1    0 0.19  NaN 0.81 NaN
2          0  0.22 0.53    1 0.42 0.58    0 0.29    1 0.47 Inf
3          0  0.22 0.53    1 0.42 0.58    0 0.29    1 0.47 Inf
4        ...   ...  ...  ...  ...  ...  ...  ...  ...  ... ...
627        1  0.05 0.81 0.03    1    0 0.97    1 0.81 0.19 Inf
628        1  0.05 0.81 0.03    1    0 0.97    1 0.81 0.19 Inf
629        1     0 0.81    0    1    0    1  NaN 0.81 0.19 NaN
plot(eva, "ROC")

par(mfrow=c(1,2))
plot(eva, "boxplot")
plot(eva, "density")

tr <- eva@thresholds
tr
  max_kappa max_spec_sens no_omission equal_prevalence equal_sens_spec
1 0.4432333     0.3457667  0.01799187        0.1942453       0.2126333
plot(rp > tr$max_spec_sens)

rc <- predict(wc, crf, na.rm=TRUE)
plot(rc)

rc2 <- predict(wc, crf, type="prob", na.rm=TRUE)
plot(rc2, 2)

eva2 <- pa_evaluate(predict(rrf, de[de$pa==1, ]), predict(rrf, de[de$pa==0, ]))
eva2
@stats
    np   na prevalence   auc    cor pcor   ODP
1 1866 5000      0.272 0.476 -0.149    0 0.728

@thresholds
  max_kappa max_spec_sens no_omission equal_prevalence equal_sens_spec
1         0             0           0             0.27           0.001

@tr_stats
    treshold kappa  CCR  TPR  TNR  FPR  FNR  PPP  NPP  MCR   OR
1          0     0 0.27    1    0    1    0 0.27  NaN 0.73  NaN
2          0 -0.01 0.48 0.52 0.47 0.53 0.48 0.27 0.72 0.52 0.94
3          0 -0.01 0.48 0.52 0.47 0.53 0.48 0.27 0.72 0.52 0.94
4        ...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
530     0.99     0 0.73    0    1    0    1    0 0.73 0.27    0
531     0.99     0 0.73    0    1    0    1    0 0.73 0.27    0
532     0.99     0 0.73    0    1    0    1  NaN 0.73 0.27  NaN
par(mfrow=c(1,2))
plot(eva2, "ROC")
plot(eva2, "boxplot")

plot(rc)
points(bf[,1:2], cex=.15)

Question 4: Why would it be that the model does not extrapolate well?

The model does not extrapolate well because the environmental conditions in the eastern US are different from those in the western US and the eastern bigfoot may prefer different environmental conditions to the western bigfoot. Whatever the conditions that the eastern bigfoot prefers, they are not in the training data.

window(wc) <- NULL
pm <- predict(wc, rrf, na.rm=TRUE)
plet(pm)

Question 5: What are some countries that should consider Bigfoot as a potential invasive species?

Portugal, Spain, Chile, and Turkey are all good candidates for bigfoot’s next vacation, with potential for Northern China, Mongolia, Morrocco, France, and Greece